% Review article on the attributes that guide visual attention (Nature Reviews Neuroscience, 2004).
@article{Wolfe_attributesVisual,
  author  = {Wolfe, J. M. and Horowitz, T. S.},
  title   = {What attributes guide the deployment of visual attention and how do they do it?},
  journal = {Nature Reviews Neuroscience},
  volume  = {5},
  number  = {6},
  pages   = {495--501},
  year    = {2004},
  doi     = {10.1038/nrn1411},
}



% Feature-integration theory of attention (Cognitive Psychology, 1980).
@article{Treisman_featureIntegration,
  author  = {Treisman, Anne M. and Gelade, Garry},
  title   = {A feature-integration theory of attention},
  journal = {Cognitive Psychology},
  volume  = {12},
  number  = {1},
  pages   = {97--136},
  year    = {1980},
}

% Koch and Ullman on shifts in selective visual attention (Human Neurobiology, 1985).
@article{KochVisualAttention,
  author  = {Koch, C. and Ullman, S.},
  title   = {Shifts in selective visual attention: towards the underlying neural circuitry},
  journal = {Human Neurobiology},
  volume  = {4},
  pages   = {219--227},
  year    = {1985},
}

% RepFinder journal article; emphasis of individual authors is left to the bibliography style.
@article{ChengZMHH10,
  author  = {Cheng, Ming-Ming and Zhang, Fang-Lue and Mitra, Niloy J. and Huang, Xiaolei and Hu, Shi-Min},
  title   = {{RepFinder}: Finding Approximately Repeated Scene Elements for Image Editing},
  journal = {ACM Transactions on Graphics},
  volume  = {29},
  number  = {4},
  pages   = {83:1--83:8},
  year    = {2010},
}


% ImageSpirit journal article; emphasis of individual authors is left to the bibliography style.
@article{cheng2014imagespirit,
  author  = {Cheng, Ming-Ming and Zheng, Shuai and Lin, Wen-Yan and Vineet, Vibhav and Sturgess, Paul and Crook, Nigel and Mitra, Niloy J. and Torr, Philip},
  title   = {{ImageSpirit}: Verbal Guided Image Parsing},
  journal = {ACM Transactions on Graphics},
  volume  = {34},
  number  = {1},
  pages   = {3:1--3:11},
  year    = {2014},
  doi     = {10.1145/2682628},
}


% Same work as the entry keyed li2020automatic; both keys are kept so existing citations resolve.
@article{liAutomaticGenerationHighPerformance2020,
  author    = {Li, Zhihao and Jia, Haipeng and Zhang, Yunquan and Chen, Tun and Yuan, Liang and Vuduc, Richard},
  title     = {Automatic generation of high-performance {FFT} kernels on {ARM} and x86 {CPUs}},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {31},
  number    = {8},
  pages     = {1925--1941},
  year      = {2020},
  publisher = {IEEE},
}


% AutoFFT conference paper; acronyms are brace-protected against sentence-casing styles.
@inproceedings{liAutoFFTTemplatebasedFFT2019a,
  author    = {Li, Zhihao and Jia, Haipeng and Zhang, Yunquan and Chen, Tun and Yuan, Liang and Cao, Luning and Wang, Xiao},
  title     = {{AutoFFT}: a template-based {FFT} codes auto-generation framework for {ARM} and {X86} {CPUs}},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages     = {1--15},
  year      = {2019},
}

% ALS-WR implementation paper; acronyms are brace-protected against sentence-casing styles.
@inproceedings{chen2019efficient,
  author       = {Chen, Maosen and Chen, Tun and Chen, Qianyun},
  title        = {An efficient implementation of the {ALS-WR} algorithm on x86 {CPUs}},
  booktitle    = {International Symposium on Benchmarking, Measuring and Optimization},
  pages        = {116--122},
  year         = {2019},
  organization = {Springer},
}


% Transpose-free 3D FFT paper; acronyms are brace-protected against sentence-casing styles.
@inproceedings{chen2021transpose,
  author       = {Chen, Tun and Jia, Haipeng and Li, Zhihao and Li, Chendi and Zhang, Yunquan},
  title        = {A transpose-free three-dimensional {FFT} algorithm on {ARM} {CPUs}},
  booktitle    = {2021 IEEE 23rd Int Conf on High Performance Computing \& Communications; 7th Int Conf on Data Science \& Systems; 19th Int Conf on Smart City; 7th Int Conf on Dependability in Sensor, Cloud \& Big Data Systems \& Application (HPCC/DSS/SmartCity/DependSys)},
  pages        = {1--8},
  year         = {2021},
  organization = {IEEE},
}

% NOTE(review): no venue is recorded for this work, so it is typed as misc to avoid an
% empty-journal warning; switch to the proper type once the journal or proceedings is confirmed.
@misc{chen2023openfft,
  author = {Chen, Tun and Jia, Haipeng and Zhang, Yunquan and Li, Kun and Li, Zhihao and Zhao, Xiang and Li, Chendi},
  title  = {{OpenFFT}: An Adaptive Tuning Framework of {3D} {FFT} on Multicore {ARM} {CPUs}},
  year   = {2023},
}

% Chinese-language journal article; page range uses the BibTeX double hyphen.
@article{zhenghongxiang2005,
  author  = {郑宏翔 and 谭凌志},
  title   = {预警信号发布工作之我见},
  journal = {气象研究与应用},
  volume  = {26},
  number  = {4},
  pages   = {57--59},
  year    = {2005},
}

% TIGGE overview article; the programme name is brace-protected as a proper noun.
@article{bougeault2010thorpex,
  author    = {Bougeault, Philippe and Toth, Zoltan and Bishop, Craig and Brown, Barbara and Burridge, David and Chen, De Hui and Ebert, Beth and Fuentes, Manuel and Hamill, Thomas M and Mylne, Ken and others},
  title     = {The {THORPEX} {Interactive Grand Global Ensemble}},
  journal   = {Bulletin of the American Meteorological Society},
  volume    = {91},
  number    = {8},
  pages     = {1059--1072},
  year      = {2010},
  publisher = {American Meteorological Society},
}

% Chinese-language journal article; the Latin acronyms in the title are brace-protected.
@article{chentun2019base,
  author  = {陈暾 and 李志豪 and 贾海鹏 and 张云泉},
  title   = {基于 {ARMv8} 平台的多维 {FFT} 实现与优化研究},
  journal = {计算机学报},
  volume  = {42},
  number  = {11},
  pages   = {2384--2402},
  year    = {2019},
}

% Chinese-language book on global spectral NWP models.
% NOTE(review): the key suggests 2015 but the year field is 2023 — confirm the publication year.
@book{wu2015global,
  title = {全球数值天气预报谱模式技术},
  author = {吴建平 and 银福康 and 彭军 and 杨锦辉 and 阳向荣 and 汪祥},
  year = {2023},
  publisher = {气象出版社},
  isbn = {978-7-5029-8000-9}
}

% Patterson and Hennessy textbook.
% NOTE(review): the key says 2008 while the year field is 2016 — confirm which edition is cited.
@book{pattersonComputerOrganizationDesign2008,
  author    = {Patterson, David A. and Hennessy, John L.},
  title     = {Computer Organization and Design: The Hardware/Software Interface},
  publisher = {Morgan Kaufmann},
  year      = {2016},
}

% Article on parallelism in the ARM ISA; the acronym is brace-protected.
@article{goodacre2005parallelism,
  author    = {Goodacre, John and Sloss, Andrew N},
  title     = {Parallelism and the {ARM} instruction set architecture},
  journal   = {Computer},
  volume    = {38},
  number    = {7},
  pages     = {42--50},
  year      = {2005},
  publisher = {IEEE},
}

% Vendor documentation page for the Arm Performance Libraries reference guide.
@misc{ArmPerformanceLibraries,
  author  = {Arm},
  title   = {Arm Performance Libraries Reference Guide Release Information},
  url     = {https://developer.arm.com/documentation/101004/latest/},
  urldate = {2023-04-23},
}


% Vendor web page listing Arm intrinsics; the dash is written in LaTeX form rather than as raw Unicode.
@misc{IntrinsicsArmDeveloper,
  author  = {Arm},
  title   = {Intrinsics -- {Arm} {Developer}},
  url     = {https://developer.arm.com/architectures/instruction-sets/intrinsics/},
  urldate = {2023-05-28},
}


% Vendor documentation page for the armasm user guide.
@misc{ARMCompilerArmasm,
  author  = {Arm},
  title   = {{ARM} {Compiler} armasm {User} {Guide} {Version} 6.6},
  urldate = {2023-05-28},
  url     = {https://developer.arm.com/documentation/dui0801/g/},
}

% Chinese-language journal article on HPC scalability.
% NOTE(review): the citation key is non-ASCII, which classic BibTeX toolchains may not handle, and
% pages holds a single number that may be a page count rather than a range — confirm both.
@article{张云泉2020高性能计算多层次不连续非线性可扩展现象研究,
  author  = {张云泉 and 袁良 and 陈一峯 and 冯晓兵 and 张贺},
  title   = {高性能计算多层次不连续非线性可扩展现象研究},
  journal = {计算机学报},
  year    = {2020},
  volume  = {43},
  number  = {6},
  pages   = {17},
}

% Article on horizontal resolution in NWP and climate simulation.
@article{wedi2014increasing,
  author    = {Wedi, Nils P},
  title     = {Increasing horizontal resolution in numerical weather prediction and climate simulations: illusion or panacea?},
  journal   = {Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences},
  year      = {2014},
  volume    = {372},
  number    = {2018},
  pages     = {20130289},
  publisher = {The Royal Society Publishing},
}

% PGAS implementation of the ECMWF IFS; institution and system names are brace-protected.
@article{mozdzynski2015partitioned,
  author    = {Mozdzynski, George and Hamrud, Mats and Wedi, Nils},
  title     = {A partitioned global address space implementation of the {European Centre for Medium Range Weather Forecasts} {Integrated Forecasting System}},
  journal   = {The International Journal of High Performance Computing Applications},
  volume    = {29},
  number    = {3},
  pages     = {261--273},
  year      = {2015},
  publisher = {SAGE Publications},
  address   = {London, England},
}

% ECMWF report on the IFS modelling infrastructure.
% NOTE(review): report type and number are taken from the ECMWF Technical Memoranda series — confirm.
@techreport{wedi2015modelling,
  author      = {Wedi, N. P. and Bauer, P. and Deconinck, W. and Diamantakis, M. and Hamrud, M. and K{\"u}hnlein, C. and Malardel, S. and Mogensen, K. and Mozdzynski, G. and Smolarkiewicz, P. K.},
  title       = {The modelling infrastructure of the {Integrated Forecasting System}: Recent advances and future challenges},
  institution = {European Centre for Medium-Range Weather Forecasts},
  type        = {Technical Memorandum},
  number      = {760},
  year        = {2015},
}

% Monograph on spectral NWP models.
@book{ehrendorfer2011spectral,
  author    = {Ehrendorfer, Martin},
  title     = {Spectral numerical weather prediction models},
  publisher = {SIAM},
  year      = {2011},
}

% Textbook on numerical methods for PDEs.
@book{ames2014numerical,
  author    = {Ames, William F},
  title     = {Numerical methods for partial differential equations},
  publisher = {Academic Press},
  year      = {2014},
}

% Fast spherical harmonic transform evaluation; the model name is brace-protected.
@article{yin2018performance,
  author  = {Yin, Fukang and Wu, Guoli and Wu, Jianping and Zhao, Jun and Song, Junqiang},
  title   = {Performance evaluation of the fast spherical harmonic transform algorithm in the {Yin--He} global spectral model},
  journal = {Monthly Weather Review},
  volume  = {146},
  number  = {10},
  pages   = {3163--3182},
  year    = {2018},
}

% ECMWF web page on the IFS.
% NOTE(review): no access date or year is recorded for this URL — add urldate when known.
@misc{ECMWF1111,
  author = {ECMWF},
  title  = {{Integrated Forecasting System}},
  url    = {https://www.ecmwf.int/en/forecasts/documentation-and-support/changes-ecmwf-model},
}

% JMA-GSM climate simulation article; dashes are written in LaTeX form and the model name is protected.
@article{mizuta200620,
  author    = {Mizuta, Ryo and Oouchi, Kazuyoshi and Yoshimura, Hiromasa and Noda, Akira and Katayama, Keiichi and Yukimoto, Seiji and Hosaka, Masahiro and Kusunoki, Shoji and Kawai, Hideaki and Nakagawa, Masayuki},
  title     = {20-km-mesh global climate simulations using {JMA-GSM} model---mean climate states---},
  journal   = {Journal of the Meteorological Society of Japan. Ser. II},
  volume    = {84},
  number    = {1},
  pages     = {165--185},
  year      = {2006},
  publisher = {Meteorological Society of Japan},
}

% Workshop contribution on IFS vectorisation; the same work also appears under the key hague_ifs_2014.
@misc{77382,
  author   = {Hague, J.},
  title    = {{IFS} Vectorisation Improvements Using {Cray} Supercomputer},
  year     = {2014},
  language = {eng},
  keywords = {workshop, High performance computing, Cray},
}

% ECMWF Newsletter article on single precision in the IFS; initials are separated so styles parse them.
@article{duben2018progress,
  author  = {D{\"u}ben, P. D. and Diamantakis, M. and Lang, S. and Saarinen, S. and Sandu, I. and Wedi, N. and Wilhelmsson, T.},
  title   = {Progress in using single precision in the {IFS}},
  journal = {ECMWF Newsletter},
  volume  = {157},
  pages   = {26--31},
  year    = {2018},
}



% Source repository of the CLOUDSC mini-app (reference-manager export).
@misc{noauthor_ecmwf-ifsdwarf-p-cloudsc_2025,
  title     = {ecmwf-ifs/dwarf-p-cloudsc},
  publisher = {European Centre for Medium-Range Weather Forecasts (IFS)},
  year      = {2025},
  month     = feb,
  url       = {https://github.com/ecmwf-ifs/dwarf-p-cloudsc},
  urldate   = {2025-03-01},
  note      = {original-date: 2021-11-23T10:16:46Z},
  abstract  = {Standalone mini-app of the ECMWF cloud microphysics parameterization},
  copyright = {Apache-2.0},
}

% TAU project home page (reference-manager export).
@misc{noauthor_tau_nodate,
  title   = {{TAU} -- {Tuning} and {Analysis} {Utilities}},
  url     = {https://www.cs.uoregon.edu/research/tau/home.php},
  urldate = {2025-03-01},
  file    = {TAU - Tuning and Analysis Utilities -:C\:\\Users\\chentun\\Zotero\\storage\\9P3K5KLN\\home.html:text/html},
}


% wu bib
% Article on parallelizing the spectral transform method.
@article{worley1992parallelizing,
  author    = {Worley, Patrick H and Drake, John B},
  title     = {Parallelizing the spectral transform method},
  journal   = {Concurrency: Practice and Experience},
  year      = {1992},
  volume    = {4},
  number    = {4},
  pages     = {269--291},
  publisher = {Wiley Online Library},
}


% Article on the parallel scalability of the spectral transform method.
@article{foster1992parallel,
  author  = {Foster, Ian and Gropp, William and Stevens, Rick},
  title   = {The parallel scalability of the spectral transform method},
  journal = {Monthly Weather Review},
  year    = {1992},
  volume  = {120},
  number  = {5},
  pages   = {835--850},
}

% Article on parallelizing global spectral weather models; journal name matches the spelling in barros1995ifs.
@article{barros1994parallelization,
  author    = {Barros, Saulo R. M. and Kauranne, Tuomo},
  title     = {On the parallelization of global spectral weather models},
  journal   = {Parallel Computing},
  volume    = {20},
  number    = {9},
  pages     = {1335--1356},
  year      = {1994},
  publisher = {Elsevier},
}

% Conference paper on a parallel spectral shallow-water benchmark; initials are separated so styles parse them.
@inproceedings{worley1994parallel,
  author       = {Worley, P. H. and Foster, I. T.},
  title        = {Parallel spectral transform shallow water model: A runtime-tunable parallel benchmark code},
  booktitle    = {Proceedings of IEEE Scalable High Performance Computing Conference},
  pages        = {207--214},
  year         = {1994},
  organization = {IEEE},
}

% Published in a journal-format conference series with volume and issue, so it is typed as a journal article;
% pages holds the article number.
@article{mirin2007extending,
  author    = {Mirin, Art and Worley, Pat},
  title     = {Extending scalability of the {Community Atmosphere Model}},
  journal   = {Journal of Physics: Conference Series},
  volume    = {78},
  number    = {1},
  pages     = {012082},
  year      = {2007},
  publisher = {IOP Publishing},
}

% Oak Ridge technical report; institution and report number are stored in their own fields.
@techreport{drake1996parallel,
  author      = {Drake, J and Flanery, R and Semeraro, B and Worley, P and Foster, I and Michalakes, J and Hack, J and Williamson, D},
  title       = {Parallel community climate model: Description and user's guide},
  institution = {Oak Ridge National Laboratory},
  number      = {ORNL/TM-12285},
  year        = {1996},
}

% Article on the IFS as a parallel production code; the acronym is brace-protected.
@article{barros1995ifs,
  author    = {Barros, Saulo R. M. and Dent, David and Isaksen, Lars and Robinson, G and Mozdzynski, G and Wollenweber, F},
  title     = {The {IFS} model: A parallel production weather code},
  journal   = {Parallel Computing},
  volume    = {21},
  number    = {10},
  pages     = {1621--1638},
  year      = {1995},
  publisher = {Elsevier},
}

% Article on a spectral dynamical core; journal name matches the spelling used by the other entries in this file.
@article{rivier2002efficient,
  author  = {Rivier, L and Loft, R and Polvani, L. M.},
  title   = {An efficient spectral dynamical core for distributed memory computers},
  journal = {Monthly Weather Review},
  volume  = {130},
  number  = {5},
  pages   = {1384--1396},
  year    = {2002},
}


% jia bib
% LIBSHALOM conference paper; library and architecture names are brace-protected.
@inproceedings{yang2021libshalom,
  author    = {Yang, Weiling and Fang, Jianbin and Dong, Dezun and Su, Xing and Wang, Zheng},
  title     = {{LIBSHALOM}: Optimizing small and irregular-shaped matrix multiplications on {ARMv8} multi-cores},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages     = {1--14},
  year      = {2021},
}

% Graph-processing autotuner paper; the acronym is brace-protected.
@inproceedings{meng2019pattern,
  author    = {Meng, Ke and Li, Jiajia and Tan, Guangming and Sun, Ninghui},
  title     = {A pattern based algorithmic autotuner for graph processing on {GPUs}},
  booktitle = {Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming},
  pages     = {201--213},
  year      = {2019},
}

% GPU scheduling framework paper; the acronym is brace-protected.
@inproceedings{wang2020efficient,
  author       = {Wang, Shaoqi and Gonzalez, Oscar J and Zhou, Xiaobo and Williams, Thomas and Friedman, Brian D and Havemann, Martin and Woo, Thomas},
  title        = {An efficient and non-intrusive {GPU} scheduling framework for deep learning training systems},
  booktitle    = {SC20: International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages        = {1--13},
  year         = {2020},
  organization = {IEEE},
}

% Same work as the entry keyed liAutomaticGenerationHighPerformance2020; both keys are kept so existing citations resolve.
@article{li2020automatic,
  author    = {Li, Zhihao and Jia, Haipeng and Zhang, Yunquan and Chen, Tun and Yuan, Liang and Vuduc, Richard},
  title     = {Automatic generation of high-performance {FFT} kernels on {ARM} and x86 {CPUs}},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {31},
  number    = {8},
  pages     = {1925--1941},
  year      = {2020},
  publisher = {IEEE},
}

% IAAT conference paper; acronyms are restored and brace-protected.
@inproceedings{yao2021iaat,
  author       = {Yao, Jianyu and Shi, Boqian and Xiang, Chunyang and Jia, Haipeng and Li, Chendi and Cao, Hang and Zhang, Yunquan},
  title        = {{IAAT}: A input-aware adaptive tuning framework for small {GEMM}},
  booktitle    = {2021 IEEE 27th International Conference on Parallel and Distributed Systems (ICPADS)},
  pages        = {899--906},
  year         = {2021},
  organization = {IEEE},
}

% Sparse matrix-matrix multiplication paper; venue spelling matches the other papers from this symposium in the file.
@inproceedings{winter2019adaptive,
  author    = {Winter, Martin and Mlakar, Daniel and Zayer, Rhaleb and Seidel, Hans-Peter and Steinberger, Markus},
  title     = {Adaptive sparse matrix-matrix multiplication on the {GPU}},
  booktitle = {Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming},
  pages     = {68--81},
  year      = {2019},
}

% Conference paper on adaptive sparse tiling.
@inproceedings{hong2019adaptive,
  author    = {Hong, Changwan and Sukumaran-Rajam, Aravind and Nisa, Israt and Singh, Kunal and Sadayappan, Ponnuswamy},
  title     = {Adaptive sparse tiling for sparse matrix multiplication},
  booktitle = {Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming},
  year      = {2019},
  pages     = {300--314},
}

% Stencil autotuning framework paper; the framework acronym is restored and brace-protected.
@inproceedings{luo2015fast,
  author    = {Luo, Yulong and Tan, Guangming and Mo, Zeyao and Sun, Ninghui},
  title     = {{FAST}: A fast stencil autotuning framework based on an optimal-solution space model},
  booktitle = {Proceedings of the 29th ACM on International Conference on Supercomputing},
  pages     = {187--196},
  year      = {2015},
}

% SMAT auto-tuner paper; the tool acronym is brace-protected and the venue name is consistently capitalised.
@inproceedings{li2013smat,
  author    = {Li, Jiajia and Tan, Guangming and Chen, Mingyu and Sun, Ninghui},
  title     = {{SMAT}: An input adaptive auto-tuner for sparse matrix-vector multiplication},
  booktitle = {Proceedings of the 34th ACM SIGPLAN Conference on Programming Language Design and Implementation},
  pages     = {117--126},
  year      = {2013},
}

% BASMAT auto-tuning paper; acronyms are brace-protected.
@inproceedings{elafrou2019basmat,
  author    = {Elafrou, Athena and Goumas, Georgios and Koziris, Nectarios},
  title     = {{BASMAT}: Bottleneck-aware sparse matrix-vector multiplication auto-tuning on {GPGPUs}},
  booktitle = {Proceedings of the 24th Symposium on Principles and Practice of Parallel Programming},
  pages     = {423--424},
  year      = {2019},
}

% ApproxTuner paper; the system name is brace-protected.
@inproceedings{sharif2021approxtuner,
  author    = {Sharif, Hashim and Zhao, Yifan and Kotsifakou, Maria and Kothari, Akash and Schreiber, Ben and Wang, Elizabeth and Sarita, Yasmin and Zhao, Nathan and Joshi, Keyur and Adve, Vikram S and others},
  title     = {{ApproxTuner}: a compiler and runtime system for adaptive approximations},
  booktitle = {Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  pages     = {262--277},
  year      = {2021},
}

% Article on adaptive tuning of NWP models.
@article{gong1998adaptive,
  author  = {Gong, Jianjian and Wahba, Grace and Johnson, Donald R and Tribbia, Joseph},
  title   = {Adaptive tuning of numerical weather prediction models: Simultaneous estimation of weighting, smoothing, and physical parameters},
  journal = {Monthly Weather Review},
  year    = {1998},
  volume  = {126},
  number  = {1},
  pages   = {210--231},
}

% Berkeley technical report; the issuing department is stored as the institution.
% NOTE(review): no report number is recorded — add it once confirmed.
@techreport{asanovicParallelComputingLaboratory,
  author      = {Asanovic, Krste and Bodik, Ras and Demmel, James and Keaveny, Tony and Keutzer, Kurt and Kubiatowicz, John D and Lee, Edward A and Morgan, Nelson and Necula, George and Patterson, David A and others},
  title       = {The parallel computing laboratory at {UC Berkeley}: A research agenda based on the {Berkeley} view},
  institution = {EECS Department, University of California, Berkeley},
  year        = {2008},
}


% Flynn's taxonomy paper; this journal numbered its 1972 volume C-21.
@article{flynn1972some,
  author    = {Flynn, Michael J},
  title     = {Some computer organizations and their effectiveness},
  journal   = {IEEE Transactions on Computers},
  volume    = {C-21},
  number    = {9},
  pages     = {948--960},
  year      = {1972},
  publisher = {IEEE},
}


% Single-precision spherical harmonic transform article; the model name is brace-protected.
@article{yin2021implementation,
  author    = {Yin, Fukang and Song, Junqiang and Wu, Jianping and Zhang, Weimin},
  title     = {An implementation of single-precision fast spherical harmonic transform in {Yin--He} global spectral model},
  journal   = {Quarterly Journal of the Royal Meteorological Society},
  volume    = {147},
  number    = {737},
  pages     = {2323--2334},
  year      = {2021},
  publisher = {Wiley Online Library},
}


% FFTW3 design paper; the library name is brace-protected.
@article{frigoDesignImplementationFFTW32005,
  author    = {Frigo, Matteo and Johnson, Steven G},
  title     = {The design and implementation of {FFTW3}},
  journal   = {Proceedings of the IEEE},
  volume    = {93},
  number    = {2},
  pages     = {216--231},
  year      = {2005},
  publisher = {IEEE},
}

% Original FFTW conference paper; acronyms are brace-protected.
@inproceedings{frigoFFTWAdaptiveSoftware1998,
  author       = {Frigo, Matteo and Johnson, Steven G},
  title        = {{FFTW}: An adaptive software architecture for the {FFT}},
  booktitle    = {Proceedings of the 1998 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP'98 (Cat. No. 98CH36181)},
  volume       = {3},
  pages        = {1381--1384},
  year         = {1998},
  organization = {IEEE},
}

% Article on a metric for performance portability.
@article{pennycook2019implications,
  author    = {Pennycook, Simon J and Sewall, Jason D and Lee, Victor W},
  title     = {Implications of a metric for performance portability},
  journal   = {Future Generation Computer Systems},
  year      = {2019},
  volume    = {92},
  pages     = {947--958},
  publisher = {Elsevier},
}


% Workshop paper on Fortran tooling for climate models; the language name is brace-protected.
@inproceedings{mendez2014climate,
  author       = {M{\'e}ndez, Mariano and Tinetti, Fernando G and Overbey, Jeffrey L},
  title        = {Climate models: challenges for {Fortran} development tools},
  booktitle    = {2014 Second International Workshop on Software Engineering for High Performance Computing in Computational Science and Engineering},
  pages        = {6--12},
  year         = {2014},
  organization = {IEEE},
}

% Same article as the entry keyed schulthess_reflecting_2019; year and DOI agree with that record.
@article{schulthess2018reflecting,
  author    = {Schulthess, Thomas C and Bauer, Peter and Wedi, Nils and Fuhrer, Oliver and Hoefler, Torsten and Sch{\"a}r, Christoph},
  title     = {Reflecting on the goal and baseline for exascale computing: a roadmap based on weather and climate simulations},
  journal   = {Computing in Science \& Engineering},
  volume    = {21},
  number    = {1},
  pages     = {30--41},
  year      = {2019},
  publisher = {IEEE},
  doi       = {10.1109/MCSE.2018.2888788},
}

% Book chapter on porting climate code with OpenACC; acronyms are brace-protected.
@incollection{lapillonne2017using,
  author    = {Lapillonne, Xavier and Osterried, Katherine and Fuhrer, Oliver},
  title     = {Using {OpenACC} to port large legacy climate and weather modeling code to {GPUs}},
  booktitle = {Parallel Programming with {OpenACC}},
  pages     = {267--290},
  year      = {2017},
  publisher = {Elsevier},
}

% Journal article; journal, volume and pages are stored in their own fields rather than inside the title.
@article{schar2020kilometer,
  author  = {Sch{\"a}r, C and Fuhrer, O and Arteaga, A and Ban, N and Charpilloz, C and Di Girolamo, S and Hentgen, L and Hoefler, T and Lapillonne, X and Leutwyler, D and others},
  title   = {Kilometer-scale climate models: Prospects and challenges},
  journal = {Bulletin of the American Meteorological Society},
  volume  = {101},
  pages   = {E567--E587},
  year    = {2020},
}

% ECMWF IFS documentation, Part IV (reference-manager export).
@misc{ecmwf_ifs_2023,
  author     = {ECMWF},
  title      = {{IFS} {Documentation} {CY48R1} - {Part} {IV}: {Physical} {Processes}},
  shorttitle = {{IFS} {Documentation} {CY48R1} - {Part} {IV}},
  journal    = {ECMWF},
  year       = {2023},
  type       = {text},
  language   = {en},
  url        = {https://www.ecmwf.int/en/elibrary/81370-ifs-documentation-cy48r1-part-iv-physical-processes},
  urldate    = {2025-06-16},
  abstract   = {Chapter 1 Overview Chapter 2 Radiation Chapter 3 Turbulent transport and interactions with the surface Chapter 4 Subgrid-scale orographic drag Chapter 5 Non-orographic gravity wave drag Chapter 6 Convection Chapter 7 Clouds and large-scale precipitation Chapter 8 Surface parametrization Chapter 9 Methane oxidation Chapter 10 Ozone chemistry parametrization Chapter 11 Climatological data Chapter 12 Basic physical constants and thermodynamic functions},
  file       = {Snapshot:C\:\\Users\\chentun\\Zotero\\storage\\B84DUXEI\\81370-ifs-documentation-cy48r1-part-iv-physical-processes.html:text/html},
}

% Article on developing weather and climate models for new architectures; publisher and city are separate fields.
@article{lawrence2018crossing,
  author    = {Lawrence, Bryan N and Rezny, Michael and Budich, Reinhard and Bauer, Peter and Behrens, J{\"o}rg and Carter, Mick and Deconinck, Willem and Ford, Rupert and Maynard, Christopher and Mullerworth, Steven and others},
  title     = {Crossing the chasm: how to develop weather and climate models for next generation computers?},
  journal   = {Geoscientific Model Development},
  volume    = {11},
  number    = {5},
  pages     = {1799--1821},
  year      = {2018},
  publisher = {Copernicus Publications},
  address   = {G{\"o}ttingen, Germany},
}

% Source repository of the Loki source-to-source tool (reference-manager export).
@misc{noauthor_ecmwf-ifsloki_2025,
  title     = {ecmwf-ifs/loki},
  publisher = {European Centre for Medium-Range Weather Forecasts (IFS)},
  year      = {2025},
  month     = jun,
  url       = {https://github.com/ecmwf-ifs/loki},
  urldate   = {2025-07-02},
  note      = {original-date: 2022-11-02T14:24:41Z},
  abstract  = {Freely programmable source-to-source translation for Fortran},
  keywords  = {fortran, gpu, hpc, loki, python, source-to-source},
  copyright = {Apache-2.0},
}

% CLAW compiler article; tool and API names are brace-protected.
@article{clement2019automatic,
  author  = {Clement, Valentin and Marti, Philippe and Lapillonne, Xavier and Fuhrer, Oliver and Sawyer, William},
  title   = {Automatic Port to {OpenACC/OpenMP} for Physical Parameterization in Climate and Weather Code Using the {CLAW} Compiler},
  journal = {Supercomputing Frontiers and Innovations},
  volume  = {6},
  number  = {3},
  pages   = {51--63},
  year    = {2019},
}

% Pace atmospheric model article; the version string has no stray space and proper nouns are protected.
@article{dahm2023pace,
  author    = {Dahm, Johann and Davis, Eddie and Deconinck, Florian and Elbert, Oliver and George, Rhea and McGibbon, Jeremy and Wicky, Tobias and Wu, Elynn and Kung, Christopher and Ben-Nun, Tal and others},
  title     = {{Pace} v0.2: a {Python}-based performance-portable atmospheric model},
  journal   = {Geoscientific Model Development},
  volume    = {16},
  number    = {9},
  pages     = {2719--2736},
  year      = {2023},
  publisher = {Copernicus Publications},
  address   = {G{\"o}ttingen, Germany},
}

% NumPy article; the lowercase particle in "van der Walt" lets BibTeX parse it as the von part of the name.
@article{harris2020array,
  author    = {Harris, Charles R and Millman, K Jarrod and van der Walt, St{\'e}fan J and Gommers, Ralf and Virtanen, Pauli and Cournapeau, David and Wieser, Eric and Taylor, Julian and Berg, Sebastian and Smith, Nathaniel J and others},
  title     = {Array programming with {NumPy}},
  journal   = {Nature},
  volume    = {585},
  number    = {7825},
  pages     = {357--362},
  year      = {2020},
  publisher = {Nature Publishing Group},
  address   = {London, UK},
}

% GridTools framework article; the framework name is brace-protected.
@article{afanasyev2021gridtools,
  author    = {Afanasyev, Anton and Bianco, Mauro and Mosimann, Lukas and Osuna, Carlos and Thaler, Felix and Vogt, Hannes and Fuhrer, Oliver and VandeVondele, Joost and Schulthess, Thomas C},
  title     = {{GridTools}: A framework for portable weather and climate applications},
  journal   = {SoftwareX},
  volume    = {15},
  pages     = {100707},
  year      = {2021},
  publisher = {Elsevier},
}

% Conference paper on stateful dataflow multigraphs.
@inproceedings{ben2019stateful,
  author    = {Ben-Nun, Tal and de Fine Licht, Johannes and Ziogas, Alexandros N and Schneider, Timo and Hoefler, Torsten},
  title     = {Stateful dataflow multigraphs: A data-centric model for performance portability on heterogeneous architectures},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  year      = {2019},
  pages     = {1--14},
}

% Master's thesis on DaCe for weather models; tool names and acronyms are brace-protected.
@mastersthesis{martin2023dace,
  author = {Martin, Samuel},
  title  = {{DaCe} on {GPU} for Climate and Weather Models Using {CLOUDSC} as a Case Study},
  school = {ETH Zurich},
  year   = {2023},
}

% Discussion paper on a high-level programming model for NWP; the city is stored as address.
% NOTE(review): publisher inferred from the other Geoscientific Model Development entries in this file — confirm.
@article{ubbiali2024exploring,
  author    = {Ubbiali, Stefano and K{\"u}hnlein, Christian and Sch{\"a}r, Christoph and Schlemmer, Linda and Schulthess, Thomas C and Staneker, Michael and Wernli, Heini},
  title     = {Exploring a high-level programming model for the {NWP} domain using {ECMWF} microphysics schemes},
  journal   = {Geoscientific Model Development Discussions},
  volume    = {2024},
  pages     = {1--30},
  year      = {2024},
  publisher = {Copernicus Publications},
  address   = {G{\"o}ttingen, Germany},
}

% ESCAPE project article; the project acronym is brace-protected.
@article{muller2019escape,
  author    = {M{\"u}ller, Andreas and Deconinck, Willem and K{\"u}hnlein, Christian and Mengaldo, Gianmarco and Lange, Michael and Wedi, Nils and Bauer, Peter and Smolarkiewicz, Piotr K and Diamantakis, Michail and Lock, Sarah-Jane and others},
  title     = {The {ESCAPE} project: energy-efficient scalable algorithms for weather prediction at exascale},
  journal   = {Geoscientific Model Development},
  volume    = {12},
  number    = {10},
  pages     = {4425--4441},
  year      = {2019},
  publisher = {Copernicus GmbH},
}


% Arm SVE optimization guide web page (reference-manager export, no author or year recorded).
@misc{noauthor_sve_nodate,
  title   = {{SVE} {Optimization} {Guide}},
  urldate = {2025-07-08},
  url     = {https://developer.arm.com/documentation/102699/0100/Optimizing-with-intrinsics},
}

% Workshop contribution on IFS vectorisation; the same work also appears under the key 77382.
@misc{hague_ifs_2014,
  author   = {Hague, J.},
  title    = {{IFS} {Vectorisation} {Improvements} {Using} {Cray} {Supercomputer}},
  year     = {2014},
  keywords = {Cray, High performance computing, workshop},
  language = {eng},
}

% Same article as the entry keyed schulthess2018reflecting (reference-manager export).
% The accented author name uses the escaped form found in the other entries of this file.
@article{schulthess_reflecting_2019,
  author     = {Schulthess, Thomas C. and Bauer, Peter and Wedi, Nils and Fuhrer, Oliver and Hoefler, Torsten and Sch{\"a}r, Christoph},
  title      = {Reflecting on the {Goal} and {Baseline} for {Exascale} {Computing}: {A} {Roadmap} {Based} on {Weather} and {Climate} {Simulations}},
  shorttitle = {Reflecting on the {Goal} and {Baseline} for {Exascale} {Computing}},
  journal    = {Computing in Science \& Engineering},
  volume     = {21},
  number     = {1},
  pages      = {30--41},
  month      = jan,
  year       = {2019},
  issn       = {1558-366X},
  doi        = {10.1109/MCSE.2018.2888788},
  url        = {https://ieeexplore.ieee.org/document/8586949},
  urldate    = {2025-07-08},
  abstract   = {We present a roadmap towards exascale computing based on true application performance goals. It is based on two state-of-the art European numerical weather prediction models (IFS from ECMWF and COSMO from MeteoSwiss) and their current performance when run at very high spatial resolution on present-day supercomputers. We conclude that these models execute about 100–250 times too slow for operational throughput rates at a horizontal resolution of 1 km, even when executed on a full petascale system with nearly 5000 state-of-the-art hybrid GPU-CPU nodes. Our analysis of the performance in terms of a metric that assesses the efficiency of memory use shows a path to improve the performance of hardware and software in order to meet operational requirements early next decade.},
  keywords   = {Computational modeling, Exascale computing, High performance computing, Numerical models, Spatial resolution, Supercomputers, Weather forecasting},
  file       = {Snapshot:C\:\\Users\\chentun\\Zotero\\storage\\496AC6J4\\8586949.html:text/html},
}

% Perspective article on Earth-system science computing (reference-manager export).
@article{bauer_digital_2021,
  author   = {Bauer, Peter and Dueben, Peter D. and Hoefler, Torsten and Quintino, Tiago and Schulthess, Thomas C. and Wedi, Nils P.},
  title    = {The digital revolution of {Earth}-system science},
  journal  = {Nature Computational Science},
  volume   = {1},
  number   = {2},
  pages    = {104--113},
  month    = feb,
  year     = {2021},
  issn     = {2662-8457},
  doi      = {10.1038/s43588-021-00023-0},
  url      = {https://doi.org/10.1038/s43588-021-00023-0},
  abstract = {Computational science is crucial for delivering reliable weather and climate predictions. However, despite decades of high-performance computing experience, there is serious concern about the sustainability of this application in the post-Moore/Dennard era. Here, we discuss the present limitations in the field and propose the design of a novel infrastructure that is scalable and more adaptable to future, yet unknown computing architectures.},
}


% OpenMP overview article (reference-manager export); the URL points at the document page itself.
@article{dagum_openmp_1998,
  author     = {Dagum, L. and Menon, R.},
  title      = {{OpenMP}: an industry standard {API} for shared-memory programming},
  shorttitle = {{OpenMP}},
  journal    = {IEEE Computational Science and Engineering},
  volume     = {5},
  number     = {1},
  pages      = {46--55},
  month      = jan,
  year       = {1998},
  issn       = {1558-190X},
  doi        = {10.1109/99.660313},
  url        = {https://ieeexplore.ieee.org/document/660313},
  urldate    = {2025-07-08},
  abstract   = {At its most elemental level, OpenMP is a set of compiler directives and callable runtime library routines that extend Fortran (and separately, C and C++ to express shared memory parallelism. It leaves the base language unspecified, and vendors can implement OpenMP in any Fortran compiler. Naturally, to support pointers and allocatables, Fortran 90 and Fortran 95 require the OpenMP implementation to include additional semantics over Fortran 77. OpenMP leverages many of the X3H5 concepts while extending them to support coarse grain parallelism. The standard also includes a callable runtime library with accompanying environment variables.},
  keywords   = {ANSI standards, Coherence, Computer architecture, Hardware, Message passing, Parallel processing, Parallel programming, Power system modeling, Scalability, Software systems},
}

% OpenACC book; edition is an ordinal word and the abstract ends at its last sentence.
@book{1055553175812,
  author    = {Chandrasekaran, Sunita and Juckeland, Guido},
  title     = {{OpenACC} for Programmers: Concepts and Strategies},
  edition   = {First},
  publisher = {Addison-Wesley Professional},
  year      = {2017},
  isbn      = {0134694287},
  abstract  = {The Complete Guide to OpenACC for Massively Parallel Programming Scientists and technical professionals can use OpenACC to leverage the immense power of modern GPUs without the complexity traditionally associated with programming them. OpenACC for Programmers is one of the first comprehensive and practical overviews of OpenACC for massively parallel programming. This book integrates contributions from 19 leading parallel-programming experts from academia, public research organizations, and industry. The authors and editors explain each key concept behind OpenACC, demonstrate how to use essential OpenACC development tools, and thoroughly explore each OpenACC feature set. Throughout, youll find realistic examples, hands-on exercises, and case studies showcasing the efficient use of OpenACC language constructs. Youll discover how OpenACCs language constructs can be translated to maximize application performance, and how its standard interface can target multiple platforms via widely used programming languages. Each chapter builds on what youve already learned, helping you build practical mastery one step at a time, whether youre a GPU programmer, scientist, engineer, or student. All example code and exercise solutions are available for download at GitHub. 
Discover how OpenACC makes scalable parallel programming easier and more practical Walk through the OpenACC spec and learn how OpenACC directive syntax is structured Get productive with OpenACC code editors, compilers, debuggers, and performance analysis tools Build your first real-world OpenACC programs Exploit loop-level parallelism in OpenACC, understand the levels of parallelism available, and maximize accuracy or performance Learn how OpenACC programs are compiled Master OpenACC programming best practices Overcome common performance, portability, and interoperability challenges Efficiently distribute tasks across multiple processors Register your product at informit.com/register for convenient access to downloads, updates, and/or corrections as they become available.},
}

% NOTE(review): the title is a slide header captured by the reference manager, not the talk title
% (possibly Beazley's ChiPy talk on the Python GIL) — replace it once the real title is confirmed.
% Year, month and URL are taken from the text of that header.
@misc{beazley_david_nodate,
  author   = {Beazley, David},
  title    = {David {Beazley} http://www.dabeaz.com {June} 11, 2009 @ chipy},
  month    = jun,
  year     = {2009},
  url      = {http://www.dabeaz.com},
  language = {en},
  file     = {Beazley - David Beazley httpwww.dabeaz.com June 11, 2009 .pdf:C\:\\Users\\chentun\\Zotero\\storage\\Z2PKYT47\\Beazley - David Beazley httpwww.dabeaz.com June 11, 2009 .pdf:application/pdf},
}

% Phytium 2000+ performance study; architecture and product names are brace-protected.
@article{fang2021performance,
  author    = {Fang, Jian-Bin and Liao, Xiang-Ke and Huang, Chun and Dong, De-Zun},
  title     = {Performance evaluation of memory-centric {ARMv8} many-core architectures: A case study with {Phytium} 2000+},
  journal   = {Journal of Computer Science and Technology},
  volume    = {36},
  pages     = {33--43},
  year      = {2021},
  publisher = {Springer},
}

% Kunpeng 920 article; product name and acronyms are brace-protected.
@article{xia2021kunpeng,
  author    = {Xia, Jing and Cheng, Chuanning and Zhou, Xiping and Hu, Yuxing and Chun, Peter},
  title     = {{Kunpeng} 920: The first 7-nm chiplet-based 64-core {ARM} {SoC} for cloud services},
  journal   = {IEEE Micro},
  volume    = {41},
  number    = {5},
  pages     = {67--75},
  year      = {2021},
  publisher = {IEEE},
}

% A64FX / Fugaku co-design paper; quotation marks are written in LaTeX form and proper names are protected.
@inproceedings{sato2020co,
  author       = {Sato, Mitsuhisa and Ishikawa, Yutaka and Tomita, Hirofumi and Kodama, Yuetsu and Odajima, Tetsuya and Tsuji, Miwako and Yashiro, Hisashi and Aoki, Masaki and Shida, Naoyuki and Miyoshi, Ikuo and others},
  title        = {Co-design for {A64FX} manycore processor and ``{Fugaku}''},
  booktitle    = {SC20: International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages        = {1--15},
  year         = {2020},
  organization = {IEEE},
}

% Classic vectorization article; the language name is protected and publisher/city are separate fields.
@article{allen1987automatic,
  author    = {Allen, Randy and Kennedy, Ken},
  title     = {Automatic translation of {Fortran} programs to vector form},
  journal   = {ACM Transactions on Programming Languages and Systems (TOPLAS)},
  volume    = {9},
  number    = {4},
  pages     = {491--542},
  year      = {1987},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

% Conference paper on compiling for vector-thread architectures; venue name is consistently capitalised.
@inproceedings{hampton2008compiling,
  author    = {Hampton, Mark and Asanovic, Krste},
  title     = {Compiling for vector-thread architectures},
  booktitle = {Proceedings of the 6th Annual IEEE/ACM International Symposium on Code Generation and Optimization},
  pages     = {205--215},
  year      = {2008},
}

% Textbook on dependence-based compiler optimization.
@book{kennedy2001optimizing,
  author    = {Kennedy, Ken and Allen, John R},
  title     = {Optimizing compilers for modern architectures: a dependence-based approach},
  publisher = {Morgan Kaufmann Publishers Inc.},
  year      = {2001},
}

% Conference paper evaluating vectorizing compilers.
@inproceedings{maleki2011evaluation,
  author       = {Maleki, Saeed and Gao, Yaoqing and Garzar{\'a}n, Maria J and Wong, Tommy and Padua, David A},
  title        = {An evaluation of vectorizing compilers},
  booktitle    = {2011 International Conference on Parallel Architectures and Compilation Techniques},
  year         = {2011},
  pages        = {372--382},
  organization = {IEEE},
}

% Article on tiling for cache temporal locality; publisher and city are separate fields.
@article{song1999new,
  author    = {Song, Yonghong and Li, Zhiyuan},
  title     = {New tiling techniques to improve cache temporal locality},
  journal   = {ACM SIGPLAN Notices},
  volume    = {34},
  number    = {5},
  pages     = {215--228},
  year      = {1999},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

% Article on tile size selection; publisher and city are separate fields.
@article{coleman1995tile,
  author    = {Coleman, Stephanie and McKinley, Kathryn S},
  title     = {Tile size selection using cache organization and data layout},
  journal   = {ACM SIGPLAN Notices},
  volume    = {30},
  number    = {6},
  pages     = {279--290},
  year      = {1995},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

% Loop transformation theory article; journal name matches the spelling used by the other entries in this file.
@article{wolf1991loop,
  author    = {Wolf, Michael E and Lam, Monica S},
  title     = {A loop transformation theory and an algorithm to maximize parallelism},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {2},
  number    = {4},
  pages     = {452--471},
  year      = {1991},
  publisher = {IEEE Computer Society},
}

% Article on loop transformations for data locality; publisher and city are separate fields.
@article{mckinley1996improving,
  author    = {McKinley, Kathryn S and Carr, Steve and Tseng, Chau-Wen},
  title     = {Improving data locality with loop transformations},
  journal   = {ACM Transactions on Programming Languages and Systems (TOPLAS)},
  volume    = {18},
  number    = {4},
  pages     = {424--453},
  year      = {1996},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

% WRBench article; benchmark and architecture names are restored and brace-protected.
@article{gao2023wrbench,
  author    = {Gao, Wan-Rong and Fang, Jian-Bin and Huang, Chun and Xu, Chuan-Fu and Wang, Zheng},
  title     = {{WRBench}: Comparing cache architectures and coherency protocols on {ARMv8} many-core systems},
  journal   = {Journal of Computer Science and Technology},
  volume    = {38},
  number    = {6},
  pages     = {1323--1338},
  year      = {2023},
  publisher = {Springer},
}